t-SNE is a recently popular dimension reduction method.
Let’s first make some data: points distributed around an ellipse in \(n\) dimensions.
n <- 20
npts <- 1e3
xy <- matrix(rnorm(n*npts), ncol=n)   # noise in all n dimensions
theta <- runif(npts) * 2 * pi         # angle of each point around the ellipse
ab <- matrix(rnorm(2*n), nrow=2)      # two directions spanning a plane in R^n
# make the second direction orthogonal to the first, then normalize both rows
ab[2,] <- ab[2,] - ab[1,] * sum(ab[1,] * ab[2,]) / sum(ab[1,]^2)
ab <- sweep(ab, 1, sqrt(rowSums(ab^2)), "/")
for (k in 1:npts) {
    dxy <- 4 * c(cos(theta[k]), sin(theta[k])) %*% ab
    xy[k,] <- xy[k,] + dxy
}
Here’s what the data look like:
pairs(xy, pch=20)
But there is hidden two-dimensional structure:
plot(xy %*% t(ab), xlab='dimension 1', ylab='dimension 2',
     pch=20, col=rainbow(nrow(xy))[rank(theta)])
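For reference, there are off-the-shelf implementations of t-SNE; here is a minimal sketch using the Rtsne package (an extra dependency, assumed to be installed, and not used anywhere below), just to see what a standard run looks like on these data:
library(Rtsne)  # off-the-shelf t-SNE; assumed installed, not needed below
rtsne_fit <- Rtsne(xy, dims=2, perplexity=30)
plot(rtsne_fit$Y, xlab='t-sne 1', ylab='t-sne 2',
     pch=20, col=rainbow(nrow(xy))[rank(theta)])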
Now let’s build the distance matrix.
dmat <- as.matrix(dist(xy))
The quantity to be minimized is the Kullback-Leibler divergence between \(p\) and \(q\), defined as \[\begin{aligned} \text{KL}(p|q) &= \sum_x p_x \log(p_x/q_x) . \end{aligned}\]
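In the Stan model below, \(p\) and \(q\) are row-normalized similarity matrices: \(p\) is built from a Gaussian kernel applied to the (rescaled, squared) input distances, and \(q\) from a Cauchy kernel applied to distances between the embedded points \(y\). Up to how the diagonal entries are handled, the quantities the code computes are \[\begin{aligned} p_{ij} &= \frac{\exp(-d_{ij}^2/2\sigma^2)}{\sum_{\ell \neq i} \exp(-d_{i\ell}^2/2\sigma^2)}, \qquad q_{ij} = \frac{1/(1 + \|y_i - y_j\|^2)}{\sum_{\ell \neq i} 1/(1 + \|y_i - y_\ell\|^2)}, \end{aligned}\] and the target is \(-\sum_{i,j} p_{ij} \log(p_{ij}/q_{ij})\), i.e., minus the sum of row-wise KL divergences.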
Here is a Stan block.
tsne_block <- '
data {
    int N;            // number of points
    int n;            // input dimension
    int k;            // output dimension
    matrix[N,N] dsq;  // distance matrix, squared and rescaled
}
parameters {
    real<lower=0> sigma_sq;  // scale of the Gaussian kernel for p
    matrix[N,k] y;           // the k-dimensional embedding
}
model {
    matrix[N,N] q;
    real dt;
    matrix[N,N] p;
    // q: Cauchy (Student-t, one df) similarities between embedded points
    q[N,N] = 1.0;
    for (i in 1:(N-1)) {
        q[i,i] = 1.0;
        for (j in (i+1):N) {
            q[i,j] = 1 / (1 + squared_distance(y[i], y[j]));
            q[j,i] = q[i,j];
        }
    }
    // normalize each row by its off-diagonal sum
    for (i in 1:N) {
        q[i] = q[i] / (sum(q[i]) - q[i,i]);
    }
    // p: Gaussian similarities computed from the input distances
    p = exp(-dsq/(2*sigma_sq));
    for (i in 1:N) {
        p[i] = p[i] / (sum(p[i]) - p[i,i]);
    }
    // the target is minus the sum of row-wise KL divergences
    for (i in 1:N) {
        dt = (-1) * sum(p[i] .* log(p[i] ./ q[i]));
        target += dt;
    }
    sigma_sq ~ normal(0, 10);
}'
library(rstan)
tk <- 2  # number of output dimensions
tsne_model <- stan_model(model_code=tsne_block)
runtime <- system.time(tsne_optim <- optimizing(tsne_model,
                data=list(N=nrow(xy),
                          n=ncol(xy),
                          k=tk,
                          dsq=(dmat/max(dmat))^2)))
## Initial log joint probability = -260.345
## Optimization terminated normally:
## Convergence detected: relative gradient magnitude is below tolerance
runtime
## user system elapsed
## 22.040 0.068 22.148
# pull the embedding coordinates out of the named vector of optimized parameters
out_y <- do.call(cbind, lapply(1:tk, function (k) {
        tsne_optim$par[sprintf("y[%d,%d]", 1:nrow(xy), k)]
    } ) )
layout(t(1:2))
plot(out_y, xlab='t-sne 1', ylab='t-sne 2', main="t-SNE",
     pch=20, col=rainbow(nrow(xy))[rank(theta)])
plot(xy %*% t(ab), xlab='input 1', ylab='input 2', main="'truth'",
     pch=20, col=rainbow(nrow(xy))[rank(theta)])
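As an informal check (not part of the fit above), we can compare pairwise distances in the t-SNE embedding to pairwise distances in the hidden two-dimensional coordinates; t-SNE doesn’t try to preserve long-range distances, so we’d only expect agreement at short range:
# compare pairwise distances: hidden 2D coordinates versus t-SNE embedding
true_xy <- xy %*% t(ab)
plot(as.vector(dist(true_xy)), as.vector(dist(out_y)),
     pch=20, col=adjustcolor("black", 0.05),
     xlab='distance in hidden coordinates', ylab='distance in t-SNE embedding')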
This time let’s apply the method to a high-dimensional random walk.
library(matrixStats)  # provides colCumsums
n <- 40
npts <- 1e2
rw <- colCumsums(matrix(rnorm(n*npts), ncol=n))  # each row is one step of an n-dimensional walk
Here’s what the data look like:
pairs(rw, col=adjustcolor(rainbow(npts), 0.5), pch=20)
Now let’s build the distance matrix.
rw_dmat <- as.matrix(dist(rw))
rw_runtime <- system.time(rw_tsne_optim <- optimizing(tsne_model,
                data=list(N=nrow(rw),
                          n=ncol(rw),
                          k=tk,
                          dsq=(rw_dmat/max(rw_dmat))^2)))
## Initial log joint probability = -39.0414
## Optimization terminated normally:
## Convergence detected: relative gradient magnitude is below tolerance
rw_runtime
## user system elapsed
## 0.301 0.000 0.302
rw_y <- do.call(cbind, lapply(1:tk, function (k) {
        rw_tsne_optim$par[sprintf("y[%d,%d]", 1:nrow(rw), k)]
    } ) )
plot(rw_y, xlab='t-sne 1', ylab='t-sne 2', main="t-SNE",
     pch=20, col=rainbow(nrow(rw)))
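Since the rows of rw are ordered in time, one informal way to look at this result (again, a check added here, not part of the fit itself) is to plot distance in the embedding against separation in time; if the walk’s sequential structure is captured, points nearby in time should tend to be nearby in the embedding:
# embedding distance versus separation in time along the walk
time_sep <- as.vector(dist(1:nrow(rw)))
embed_dist <- as.vector(dist(rw_y))
plot(time_sep, embed_dist, pch=20, col=adjustcolor("black", 0.2),
     xlab='separation in time', ylab='distance in t-SNE embedding')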